ggplotly() and plot_ly()plot_geo()We will work with two Starbucks datasets, one on the store locations (global) and one for the nutritional data for their food and drink items. We will do some text analysis of the menu items.
sb_locs <- read_csv("starbucks-locations.csv")
## Rows: 25600 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): Brand, Store Number, Store Name, Ownership Type, Street Address, C...
## dbl (2): Longitude, Latitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sb_nutr <- read_csv("starbucks-menu-nutrition.csv")
## Rows: 205 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Item, Category
## dbl (5): Calories, Fat (g), Carb. (g), Fiber (g), Protein (g)
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
usa_pop <- read_csv("us_state_pop.csv")
## Rows: 55 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): state
## dbl (1): population
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
usa_states<-read_csv("states.csv")
## Rows: 51 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): State, Abbreviation
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(sb_locs)
## # A tibble: 6 × 13
## Brand `Store Number` `Store Name` `Ownership Type` `Street Address` City
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Starbucks 47370-257954 Meritxell, 96 Licensed Av. Meritxell, … Ando…
## 2 Starbucks 22331-212325 Ajman Drive … Licensed 1 Street 69, Al… Ajman
## 3 Starbucks 47089-256771 Dana Mall Licensed Sheikh Khalifa … Ajman
## 4 Starbucks 22126-218024 Twofour 54 Licensed Al Salam Street Abu …
## 5 Starbucks 17127-178586 Al Ain Tower Licensed Khaldiya Area, … Abu …
## 6 Starbucks 17688-182164 Dalma Mall, … Licensed Dalma Mall, Mus… Abu …
## # ℹ 7 more variables: `State/Province` <chr>, Country <chr>, Postcode <chr>,
## # `Phone Number` <chr>, Timezone <chr>, Longitude <dbl>, Latitude <dbl>
str(sb_locs)
## spc_tbl_ [25,600 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Brand : chr [1:25600] "Starbucks" "Starbucks" "Starbucks" "Starbucks" ...
## $ Store Number : chr [1:25600] "47370-257954" "22331-212325" "47089-256771" "22126-218024" ...
## $ Store Name : chr [1:25600] "Meritxell, 96" "Ajman Drive Thru" "Dana Mall" "Twofour 54" ...
## $ Ownership Type: chr [1:25600] "Licensed" "Licensed" "Licensed" "Licensed" ...
## $ Street Address: chr [1:25600] "Av. Meritxell, 96" "1 Street 69, Al Jarf" "Sheikh Khalifa Bin Zayed St." "Al Salam Street" ...
## $ City : chr [1:25600] "Andorra la Vella" "Ajman" "Ajman" "Abu Dhabi" ...
## $ State/Province: chr [1:25600] "7" "AJ" "AJ" "AZ" ...
## $ Country : chr [1:25600] "AD" "AE" "AE" "AE" ...
## $ Postcode : chr [1:25600] "AD500" NA NA NA ...
## $ Phone Number : chr [1:25600] "376818720" NA NA NA ...
## $ Timezone : chr [1:25600] "GMT+1:00 Europe/Andorra" "GMT+04:00 Asia/Dubai" "GMT+04:00 Asia/Dubai" "GMT+04:00 Asia/Dubai" ...
## $ Longitude : num [1:25600] 1.53 55.47 55.47 54.38 54.54 ...
## $ Latitude : num [1:25600] 42.5 25.4 25.4 24.5 24.5 ...
## - attr(*, "spec")=
## .. cols(
## .. Brand = col_character(),
## .. `Store Number` = col_character(),
## .. `Store Name` = col_character(),
## .. `Ownership Type` = col_character(),
## .. `Street Address` = col_character(),
## .. City = col_character(),
## .. `State/Province` = col_character(),
## .. Country = col_character(),
## .. Postcode = col_character(),
## .. `Phone Number` = col_character(),
## .. Timezone = col_character(),
## .. Longitude = col_double(),
## .. Latitude = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
distinct(sb_locs, Country) |> pull(Country)
## [1] "AD" "AE" "AR" "AT" "AU" "AW" "AZ" "BE" "BG" "BH" "BN" "BO" "BR" "BS" "CA"
## [16] "CH" "CL" "CN" "CO" "CR" "CW" "CY" "CZ" "DE" "DK" "EG" "ES" "FI" "FR" "GB"
## [31] "GR" "GT" "HU" "ID" "IE" "IN" "JO" "JP" "KH" "KR" "KW" "KZ" "LB" "LU" "MA"
## [46] "MC" "MX" "MY" "NL" "NO" "NZ" "OM" "PA" "PE" "PH" "PL" "PR" "PT" "QA" "RO"
## [61] "RU" "SA" "SE" "SG" "SK" "SV" "TH" "TR" "TT" "TW" "US" "VN" "ZA"
head(sb_nutr)
## # A tibble: 6 × 7
## Item Category Calories `Fat (g)` `Carb. (g)` `Fiber (g)` `Protein (g)`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Chonga Bagel Food 300 5 50 3 12
## 2 8-Grain Roll Food 380 6 70 7 10
## 3 Almond Croi… Food 410 22 45 3 10
## 4 Apple Fritt… Food 460 23 56 2 7
## 5 Banana Nut … Food 420 22 52 2 6
## 6 Blueberry M… Food 380 16 53 1 6
str(sb_nutr)
## spc_tbl_ [205 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Item : chr [1:205] "Chonga Bagel" "8-Grain Roll" "Almond Croissant" "Apple Fritter" ...
## $ Category : chr [1:205] "Food" "Food" "Food" "Food" ...
## $ Calories : num [1:205] 300 380 410 460 420 380 420 240 350 320 ...
## $ Fat (g) : num [1:205] 5 6 22 23 22 16 17 12 22 16 ...
## $ Carb. (g) : num [1:205] 50 70 45 56 52 53 61 28 38 36 ...
## $ Fiber (g) : num [1:205] 3 7 3 2 2 1 2 1 0 1 ...
## $ Protein (g): num [1:205] 12 10 10 7 6 6 5 5 2 8 ...
## - attr(*, "spec")=
## .. cols(
## .. Item = col_character(),
## .. Category = col_character(),
## .. Calories = col_double(),
## .. `Fat (g)` = col_double(),
## .. `Carb. (g)` = col_double(),
## .. `Fiber (g)` = col_double(),
## .. `Protein (g)` = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
head(usa_pop)
## # A tibble: 6 × 2
## state population
## <chr> <dbl>
## 1 Alabama 4779736
## 2 Alaska 710231
## 3 Arizona 6392017
## 4 Arkansas 2915918
## 5 California 37253956
## 6 Colorado 5029196
str(usa_pop)
## spc_tbl_ [55 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ state : chr [1:55] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ population: num [1:55] 4779736 710231 6392017 2915918 37253956 ...
## - attr(*, "spec")=
## .. cols(
## .. state = col_character(),
## .. population = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
head(usa_states)
## # A tibble: 6 × 2
## State Abbreviation
## <chr> <chr>
## 1 Alabama AL
## 2 Alaska AK
## 3 Arizona AZ
## 4 Arkansas AR
## 5 California CA
## 6 Colorado CO
str(usa_states)
## spc_tbl_ [51 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ State : chr [1:51] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ Abbreviation: chr [1:51] "AL" "AK" "AZ" "AR" ...
## - attr(*, "spec")=
## .. cols(
## .. State = col_character(),
## .. Abbreviation = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
sb_usa <- sb_locs |> filter(Country == "US")
sb_locs_state <- sb_usa |>
group_by(`State/Province`) |>
rename(Abbreviation = `State/Province`) |>
summarize(n_stores = n())
# need state abbreviations
usa_pop_abbr <- usa_states |>
full_join(usa_pop,
by = c("State" = "state"))
sb_locs_state <- full_join(sb_locs_state, usa_pop_abbr, by = "Abbreviation")
sb_locs_state
## # A tibble: 55 × 4
## Abbreviation n_stores State population
## <chr> <int> <chr> <dbl>
## 1 AK 49 Alaska 710231
## 2 AL 85 Alabama 4779736
## 3 AR 55 Arkansas 2915918
## 4 AZ 488 Arizona 6392017
## 5 CA 2821 California 37253956
## 6 CO 481 Colorado 5029196
## 7 CT 123 Connecticut 3574097
## 8 DC 91 District of Columbia 601723
## 9 DE 25 Delaware 897934
## 10 FL 694 Florida 18801310
## # ℹ 45 more rows
summary(sb_locs_state)
## Abbreviation n_stores State population
## Length:55 Min. : 8.0 Length:55 Min. : 56882
## Class :character 1st Qu.: 56.5 Class :character 1st Qu.: 1344331
## Mode :character Median : 123.0 Mode :character Median : 3751351
## Mean : 266.8 Mean : 5677621
## 3rd Qu.: 332.0 3rd Qu.: 6515716
## Max. :2821.0 Max. :37253956
## NA's :4
ggplotly for EDAAnswer the following questions:
Are the number of Starbucks proportional to the population of a state? (scatterplot)
Is the caloric distribution of Starbucks menu items different for drinks and food? (histogram)
What are the top 20 words in Starbucks menu items? (bar plot)
p1 <- ggplot(sb_locs_state, aes(x = population, y = n_stores)) +
geom_point() +
theme_minimal() +
labs(x = "Population", y = "Number of Starbucks Stores", title = "Starbucks Stores vs. State Population") +
geom_smooth()
ggplotly(p1)
## Warning: Removed 4 rows containing non-finite values (`stat_smooth()`).
The number of Starbucks is possitively proportional to the population of a state.
p2 <- ggplot(sb_nutr, aes(x = Calories, fill = Category)) +
geom_histogram(position = "identity", alpha = 0.8, bins = 30) +
theme_minimal() +
labs(x = "Calories", y = "n_stores", title = "Caloric Distribution of Starbucks Menu Items")
ggplotly(p2)
The caloric distribution of Starbucks menu items is different for drinks and food. Food tends to have higher calories.
p3<- sb_nutr |>
unnest_tokens(word, Item) |>
count(word, sort = TRUE) |>
head(20) |>
ggplot(aes(x = reorder(word, n), y = n))+
geom_col()+
coord_flip()+
labs(x = "Frequency", y = "Word", title = "Top 20 Words in Starbucks Menu Items")
ggplotly(p3)
plot_ly()plot_ly() representing the
relationship between calories and carbscal_carbs_plot <- sb_nutr |>
plot_ly(x = ~Calories, y = ~`Carb. (g)`,
type = 'scatter',
mode = 'markers',
color = ~Category,
marker = list(sizemode='diameter')) |>
layout(title = "Relationship between Calories and Carbs",
xaxis = list(title = "Calories"),
yaxis = list(title = "Carbs"))
cal_carbs_plot
hovermode = "compare"top_10 <- sb_nutr |>
unnest_tokens(word, Item) |>
count(word, sort = TRUE) |>
head(10) |>
select(word) |>
unlist()
sb_nutr_top_words <- sb_nutr |>
mutate(contains_word = str_detect(Item, regex(paste(top_10, collapse = "|"), ignore_case = TRUE))) |>
filter(contains_word == TRUE)
cal_carb_top_words <- sb_nutr_top_words |>
plot_ly(x = ~Calories, y = ~`Carb. (g)`,
type = 'scatter',
mode = 'markers',
color = ~Category,
hoverinfo = 'text',
text = ~paste("Item:", Item),
marker = list(sizemode='diameter')) |>
layout(title = "Calories vs Carbs for Items with Top 10 Words ",
xaxis = list(title = "Calories"),
yaxis = list(title = "Carbs"),
hovermode = "compare")
cal_carb_top_words
plot_ly Boxplotsidentify_top_words <- function(item_name, top_words) {
contained_words <- top_words[sapply(top_words, function(word) str_detect(item_name, regex(word, ignore_case = TRUE)))]
paste(contained_words, collapse = ", ")
}
with_words <- sb_nutr_top_words |>
mutate(TopWordsContained = sapply(Item, identify_top_words, top_words = top_10)) |>
separate_rows(TopWordsContained, sep = ",\\s*")
with_words
with_words_long <- with_words |>
pivot_longer(
cols = c(`Fat (g)`, `Carb. (g)`, `Fiber (g)`, `Protein (g)`),
names_to = "Nutrient",
values_to = "Value")
plot_ly(data = with_words_long,
x = ~TopWordsContained,
y = ~Value, type = 'box',
color = ~Nutrient) |>
layout(yaxis = list(title = 'Nutritional Value'),
xaxis = list(title = 'Top Words'),
title = 'Nutritional Content by Top Words')
sb_nutr_top_words |>
plot_ly(x = ~Calories, y = ~`Carb. (g)`, z = ~`Protein (g)`,
type = 'scatter3d', mode = 'markers',
sizes = c(5, 70),
marker = list(sizemode='area', opacity=0.5)) |>
layout(title = '3D Scatterplot of Nutrition Facts for Top Word Items',
scene = list(xaxis = list(title = 'Calories'),
yaxis = list(title = 'Carbs'),
zaxis = list(title = 'Protein')))
Higher calories come with higher carbs and higher protein. The three variables are positively correlated.
plot_ly Map# Set up mapping details
set_map_details <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('steelblue')
)
# Make sure both maps are on the same color scale
shadeLimit <- 125
# Create hover text
sb_locs_state$hover <- with(sb_locs_state, paste("Number of Starbucks: ", n_stores, '<br>', "State: ", Abbreviation, '<br>', "Population: ", population))
usa_pop_abbr$hover <- with(usa_pop_abbr, paste("State: ", Abbreviation, '<br>', "Population: ", population))
# Create the map
map1 <- plot_geo(sb_locs_state, locationmode = 'USA-states') |>
add_trace(
z = ~n_stores, locations = ~Abbreviation,
hoverinfo = "text",
text = ~hover,
colors = 'Purples',
marker = list(line = list(color = 'rgb(255,255,255)', width = 2))
) |>
colorbar(title = "Stores") |>
layout(title = 'Starbucks Stores per State', geo = set_map_details)
map1
map2 <- plot_geo(usa_pop_abbr, locationmode = 'USA-states') |>
add_trace(
z = ~population, locations = ~Abbreviation,
hoverinfo = "text",
text = ~hover,
colors = 'Greens',
marker = list(line = list(color = 'rgb(255,255,255)', width = 2))
) |>
colorbar(title = "Population") |>
layout(title = 'Population by State', geo = set_map_details)
map2
subplot(map1, map2)